library(GENESIS) # for example data, mixed models, etc.library(SNPRelate) # for PCA, LD pruning, etc.library(SeqArray) # working with gds fileslibrary(SeqVarTools) # for PC-Relate setuplibrary(Biobase) # working with sample annotation fileslibrary(dplyr) # tidyverse data toolslibrary(tidyr) # tidyverse data toolslibrary(ggplot2) # plottinglibrary(GGally) # plottinglibrary(RColorBrewer) # plotting (color schemes)library(gridExtra) # plotting (multiple panels)#devtools::install_github('UW-GAC/analysis_pipeline/TopmedPipeline')library(TopmedPipeline)
# genotypesgdsfmt::showfile.gds(closeall=TRUE) # make sure files are not already opengdsfile <-'../../data/1KG_phase3_subset.gds'gds <-seqOpen(gdsfile)# sample annotationannotfile <-'../../data/1KG_phase3_subset_annot.RData'annot <-getobj(annotfile)#head(pData(annot))
Identify Admixed Individuals
We’ll focus on the “ASW” population (African Ancestry in Southwest US). The anchored analysis will include CEU (Northern European from Utah) and YRI (Yoruba) as the reference individuals.
IBD analysis (KING method of moment) on genotypes:
Calculating allele counts/frequencies ...
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 1s
# of selected variants: 5,315
# of samples: 61
# of SNVs: 5,315
using 1 thread
No family is specified, and all individuals are treated as singletons.
Relationship inference in the presence of population stratification.
CPU capabilities: Double-Precision SSE2
Tue Aug 6 13:47:27 2024 (internal increment: 65536)
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
Tue Aug 6 13:47:27 2024 Done.
## step 3: PC-AiR ##### partition samples into related and unrelatedsampset <-pcairPartition(kinobj=kingMat, kin.thresh=2^(-9/2),divobj=kingMat, div.thresh=-2^(-9/2))
Using kinobj and divobj to partition samples into unrelated and related sets
Working with 61 samples
Identifying relatives for each sample using kinship threshold 0.0441941738241592
Identifying pairs of divergent samples using divergence threshold -0.0441941738241592
Partitioning samples into unrelated and related sets...
Show the code
sapply(sampset, length)
rels unrels
9 52
Show the code
# run PCA on unrelatedpca.unrel <-snpgdsPCA(gds, sample.id=sampset$unrels, snp.id=pruned)
Principal Component Analysis (PCA) on genotypes:
Calculating allele counts/frequencies ...
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
# of selected variants: 5,154
Excluding 161 SNVs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
# of samples: 52
# of SNVs: 5,154
using 1 thread
# of principal components: 32
CPU capabilities: Double-Precision SSE2
Tue Aug 6 13:47:27 2024 (internal increment: 18784)
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
Tue Aug 6 13:47:27 2024 Begin (eigenvalues and eigenvectors)
Tue Aug 6 13:47:27 2024 Done.
Show the code
# project values for relativessnp.load <-snpgdsPCASNPLoading(pca.unrel, gdsobj=gds)
SNP Loading:
# of samples: 52
# of SNPs: 5,154
using 1 thread
using the top 32 eigenvectors
Tue Aug 6 13:47:27 2024 (internal increment: 65536)
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
Tue Aug 6 13:47:27 2024 Done.
Sample Loading:
# of samples: 9
# of SNPs: 5,154
using 1 thread
using the top 32 eigenvectors
Tue Aug 6 13:47:27 2024 (internal increment: 65536)
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
Tue Aug 6 13:47:27 2024 Done.
Show the code
# combine unrelated and related PCs and order as in ASW ID listpcs <-rbind(pca.unrel$eigenvect, samp.load$eigenvect)rownames(pcs) <-c(pca.unrel$sample.id, samp.load$sample.id)samp.ord <-match(asw.ids, rownames(pcs))pcs <- pcs[samp.ord,]## step 3b: determine which PCs are ancestry-informative ###### (make a parallel coordinates plot)# merge data with sample annotation infopc.df <-as.data.frame(pcs)names(pc.df) <-1:ncol(pcs)pc.df$sample.id <-row.names(pcs)pc.df <-left_join(pc.df, pData(annot), by='sample.id')# plotpop.cols <-setNames(brewer.pal(12, "Paired"),c("ACB", "ASW", "CEU", "GBR", "CHB", "JPT", "CLM", "MXL", "LWK", "YRI", "GIH", "PUR"))ggparcoord(pc.df, columns=1:12, groupColumn="Population", scale="uniminmax") +scale_color_manual(values=pop.cols) +xlab("PC") +ylab("")
Show the code
p1 <-ggplot(pc.df, aes(`1`, `2`, color = Population)) +geom_point() +scale_color_manual(values = pop.cols) +labs(x ='PC1', y ='PC2')p2 <-ggplot(pc.df, aes(`3`, `4`, color = Population)) +geom_point() +scale_color_manual(values = pop.cols) +labs(x ='PC3', y ='PC4')p3 <-ggplot(pc.df, aes(`5`, `6`, color = Population)) +geom_point() +scale_color_manual(values = pop.cols) +labs(x ='PC5', y ='PC6')grid.arrange(p1,p2,p3)
# get new partitionpcrelMat <-pcrelateToMatrix(pcrel, scaleKin=1, verbose=FALSE)sampset2 <-pcairPartition(kinobj=pcrelMat, kin.thresh=2^(-9/2),divobj=kingMat, div.thresh=-2^(-9/2))
Using kinobj and divobj to partition samples into unrelated and related sets
Working with 61 samples
Identifying relatives for each sample using kinship threshold 0.0441941738241592
Identifying pairs of divergent samples using divergence threshold -0.0441941738241592
Partitioning samples into unrelated and related sets...
Principal Component Analysis (PCA) on genotypes:
Excluding 1,120 SNVs on non-autosomes
Calculating allele counts/frequencies ...
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
# of selected variants: 9,665
Excluding 14,975 SNVs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
# of samples: 53
# of SNVs: 9,665
using 1 thread
# of principal components: 32
CPU capabilities: Double-Precision SSE2
Tue Aug 6 13:47:32 2024 (internal increment: 18432)
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
Tue Aug 6 13:47:32 2024 Begin (eigenvalues and eigenvectors)
Tue Aug 6 13:47:32 2024 Done.
Show the code
# calculate SNP loadingssnp.load <-snpgdsPCASNPLoading(pca.naive, gdsobj=gds)
SNP Loading:
# of samples: 53
# of SNPs: 9,665
using 1 thread
using the top 32 eigenvectors
Tue Aug 6 13:47:32 2024 (internal increment: 65536)
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
Tue Aug 6 13:47:32 2024 Done.
Show the code
# plot set-upsnp.load.df <-as.data.frame(cbind(snp.load$snp.id,t(snp.load$snploading)))names(snp.load.df) <-c('snp.id', paste('PC',1:32,sep=''))missing <-!(readex.gdsn(index.gdsn(gds, 'variant.id')) %in% snp.load$snp.id)snp.annot <-data.frame(snp.id =readex.gdsn(index.gdsn(gds, 'variant.id')),chr =readex.gdsn(index.gdsn(gds, 'chromosome')),pos =readex.gdsn(index.gdsn(gds, 'position')),stringsAsFactors = F)snp.load.df <-left_join(snp.load.df, snp.annot, by ='snp.id')snp.load.df.long <- snp.load.df %>%pivot_longer(cols=PC1:PC32, names_to ='PC') %>%mutate(PC =factor(PC, levels =paste0("PC", 1:32))) %>%mutate(chr =factor(chr, levels =c(1:22, "X")))# set up color schemechr <-levels(snp.load.df.long$chr)cmap <-setNames(rep_len(brewer.pal(8, "Dark2"), length(chr)), chr)# plot first 4 PCs pagen_pcs <-length(unique(snp.load.df.long$PC))n_plots <-ceiling(n_pcs/as.integer(4)) # change 4 depending on how many plots per page you wantbins <-as.integer(cut(1:n_pcs, n_plots))i <-1bin <-paste0("PC", which(bins == i))dat <-filter(snp.load.df.long, PC %in% bin)ggplot(dat, aes(chr, value, group=interaction(chr, pos), color=chr)) +geom_point(position=position_dodge(0.8)) +facet_wrap(~PC, scales="free", ncol=1) +scale_color_manual(values=cmap, breaks=names(cmap)) +#ylim(0,1) +ylim(-0.05,0.05)+theme_bw() +theme(legend.position="none") +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank()) +xlab("chromosome") +ylab("loading")
Warning: Removed 74 rows containing missing values or values outside the scale range
(`geom_point()`).
IBD analysis (KING method of moment) on genotypes:
Calculating allele counts/frequencies ...
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 1s
# of selected variants: 8,062
# of samples: 268
# of SNVs: 8,062
using 1 thread
No family is specified, and all individuals are treated as singletons.
Relationship inference in the presence of population stratification.
CPU capabilities: Double-Precision SSE2
Tue Aug 6 13:47:37 2024 (internal increment: 65536)
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
Tue Aug 6 13:47:37 2024 Done.
## step 3: PC-AiR ##### partition samples into related and unrelatedsampset.anchored <-pcairPartition(kinobj=kingMat.anchored, kin.thresh=2^(-9/2),divobj=kingMat.anchored, div.thresh=-2^(-9/2))
Using kinobj and divobj to partition samples into unrelated and related sets
Working with 268 samples
Identifying relatives for each sample using kinship threshold 0.0441941738241592
Identifying pairs of divergent samples using divergence threshold -0.0441941738241592
Partitioning samples into unrelated and related sets...
Show the code
sapply(sampset.anchored, length)
rels unrels
18 250
Show the code
# run PCA on unrelatedpca.unrel.anchored <-snpgdsPCA(gds, sample.id=sampset.anchored$unrels, snp.id=pruned.anchored)
Principal Component Analysis (PCA) on genotypes:
Calculating allele counts/frequencies ...
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
# of selected variants: 8,062
# of samples: 250
# of SNVs: 8,062
using 1 thread
# of principal components: 32
CPU capabilities: Double-Precision SSE2
Tue Aug 6 13:47:38 2024 (internal increment: 3904)
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
Tue Aug 6 13:47:38 2024 Begin (eigenvalues and eigenvectors)
Tue Aug 6 13:47:38 2024 Done.
Show the code
# project values for relativessnp.load.anchored <-snpgdsPCASNPLoading(pca.unrel.anchored, gdsobj=gds)
SNP Loading:
# of samples: 250
# of SNPs: 8,062
using 1 thread
using the top 32 eigenvectors
Tue Aug 6 13:47:38 2024 (internal increment: 31260)
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
Tue Aug 6 13:47:38 2024 Done.
Sample Loading:
# of samples: 18
# of SNPs: 8,062
using 1 thread
using the top 32 eigenvectors
Tue Aug 6 13:47:38 2024 (internal increment: 65536)
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
Tue Aug 6 13:47:38 2024 Done.
Show the code
# combine unrelated and related PCs and order as in ASW ID listpcs.anchored <-rbind(pca.unrel.anchored$eigenvect, samp.load.anchored$eigenvect)rownames(pcs.anchored) <-c(pca.unrel.anchored$sample.id, samp.load.anchored$sample.id)samp.ord <-match(c(asw.ids, ceu.ids, yri.ids), rownames(pcs.anchored))pcs.anchored <- pcs.anchored[samp.ord,]## step 3b: determine which PCs are ancestry-informative ###### (make a parallel coordinates plot)# merge data with sample annotation infopc.df.anchored <-as.data.frame(pcs.anchored)names(pc.df.anchored) <-1:ncol(pcs.anchored)pc.df.anchored$sample.id <-row.names(pcs.anchored)pc.df.anchored <-left_join(pc.df.anchored, pData(annot), by='sample.id')# plotggparcoord(pc.df.anchored, columns=1:12, groupColumn="Population", scale="uniminmax") +scale_color_manual(values=pop.cols) +xlab("PC") +ylab("")
Show the code
p1 <-ggplot(pc.df.anchored, aes(`1`, `2`, color = Population)) +geom_point() +scale_color_manual(values = pop.cols) +labs(x ='PC1', y ='PC2')p2 <-ggplot(pc.df.anchored, aes(`3`, `4`, color = Population)) +geom_point() +scale_color_manual(values = pop.cols) +labs(x ='PC3', y ='PC4')p3 <-ggplot(pc.df.anchored, aes(`5`, `6`, color = Population)) +geom_point() +scale_color_manual(values = pop.cols) +labs(x ='PC5', y ='PC6')grid.arrange(p1,p2,p3)
# get new partitionpcrelMat.anchored <-pcrelateToMatrix(pcrel.anchored, scaleKin=1, verbose=FALSE)sampset2.anchored <-pcairPartition(kinobj=pcrelMat.anchored, kin.thresh=2^(-9/2),divobj=kingMat.anchored, div.thresh=-2^(-9/2))
Using kinobj and divobj to partition samples into unrelated and related sets
Working with 268 samples
Identifying relatives for each sample using kinship threshold 0.0441941738241592
Identifying pairs of divergent samples using divergence threshold -0.0441941738241592
Partitioning samples into unrelated and related sets...
Principal Component Analysis (PCA) on genotypes:
Excluding 1,120 SNVs on non-autosomes
Calculating allele counts/frequencies ...
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
# of selected variants: 14,028
Excluding 10,612 SNVs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
# of samples: 259
# of SNVs: 14,028
using 1 thread
# of principal components: 32
CPU capabilities: Double-Precision SSE2
Tue Aug 6 13:47:59 2024 (internal increment: 3768)
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
Tue Aug 6 13:47:59 2024 Begin (eigenvalues and eigenvectors)
Tue Aug 6 13:47:59 2024 Done.
Show the code
# calculate SNP loadingssnp.load.anchored <-snpgdsPCASNPLoading(pca.naive.anchored, gdsobj=gds)
SNP Loading:
# of samples: 259
# of SNPs: 14,028
using 1 thread
using the top 32 eigenvectors
Tue Aug 6 13:47:59 2024 (internal increment: 30172)
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
Tue Aug 6 13:47:59 2024 Done.
Show the code
# plot set-upsnp.load.df.anchored <-as.data.frame(cbind(snp.load.anchored$snp.id,t(snp.load.anchored$snploading)))names(snp.load.df.anchored) <-c('snp.id', paste('PC',1:32,sep=''))missing <-!(readex.gdsn(index.gdsn(gds, 'variant.id')) %in% snp.load.anchored$snp.id)snp.annot.anchored <-data.frame(snp.id =readex.gdsn(index.gdsn(gds, 'variant.id')),chr =readex.gdsn(index.gdsn(gds, 'chromosome')),pos =readex.gdsn(index.gdsn(gds, 'position')),stringsAsFactors = F)snp.load.df.anchored <-left_join(snp.load.df.anchored, snp.annot.anchored, by ='snp.id')snp.load.df.long.anchored <- snp.load.df.anchored %>%pivot_longer(cols=PC1:PC32, names_to ='PC') %>%mutate(PC =factor(PC, levels =paste0("PC", 1:32))) %>%mutate(chr =factor(chr, levels =c(1:22, "X")))# set up color schemechr <-levels(snp.load.df.long.anchored$chr)cmap <-setNames(rep_len(brewer.pal(8, "Dark2"), length(chr)), chr)# plot first 4 PCs pagen_pcs.anchored <-length(unique(snp.load.df.long.anchored$PC))n_plots.anchored <-ceiling(n_pcs.anchored/as.integer(4)) # change 4 depending on how many plots per page you wantbins.anchored <-as.integer(cut(1:n_pcs.anchored, n_plots.anchored))i <-1bin <-paste0("PC", which(bins == i))dat.anchored <-filter(snp.load.df.long.anchored, PC %in% bin)ggplot(dat.anchored, aes(chr, value, group=interaction(chr, pos), color=chr)) +geom_point(position=position_dodge(0.8)) +facet_wrap(~PC, scales="free", ncol=1) +scale_color_manual(values=cmap, breaks=names(cmap)) +#ylim(0,1) +ylim(-0.05,0.05)+theme_bw() +theme(legend.position="none") +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank()) +xlab("chromosome") +ylab("loading")
Warning: Removed 143 rows containing missing values or values outside the scale range
(`geom_point()`).